OpenClaw 语音交互与 TTS 实战

语音交互让 AI 助理更加人性化。本文详解 OpenClaw 的 TTS（文本转语音）功能、语音消息、故事讲述等应用场景，让你的 AI 会"说话"。

概述

OpenClaw 的语音功能包括：

🎙️ TTS 文本转语音 - 将文字转换为自然语音
📻 语音消息 - 发送语音而非文字
📖 故事讲述 - 用语音讲述故事、新闻
🎭 角色扮演 - 不同场景使用不同语音
🔊 音频处理 - 播放、录制、转换音频

一、TTS 基础

1.1 TTS 工具

OpenClaw 提供 tts 工具进行文本转语音：

javascript

// Basic usage: convert a piece of text to speech
await tts({
  text: '你好，这是语音消息',
  channel: 'telegram'  // optional: selects the output format
})

// The generated audio is sent automatically as an attachment

1.2 支持的语音格式

平台	格式	说明
Telegram	OGG/OPUS	语音消息格式
Discord	MP3	音频文件
WhatsApp	OGG	语音消息
Feishu	MP3	音频文件
通用	WAV/MP3	标准音频格式

1.3 语音配置

yaml

# ~/.openclaw/config.yaml
# Text-to-speech settings.
tts:
  enabled: true
  provider: 'elevenlabs'  # one of: elevenlabs | google | azure | local
  # NOTE(review): 'alloy' looks like an OpenAI-style voice name while the
  # provider above is elevenlabs — confirm it is valid for the chosen provider.
  defaultVoice: 'alloy'
  speed: 1.0
  pitch: 1.0
  
  # ElevenLabs settings
  elevenlabs:
    apiKey: 'encrypted:key_path'
    # NOTE(review): the model name suggests a single-language model, yet the
    # examples in this guide speak Chinese — confirm whether a multilingual
    # model is required.
    model: 'eleven_monolingual_v1'
    stability: 0.5
    similarity: 0.75
    
  # Google TTS settings
  google:
    languageCode: 'zh-CN'
    voiceName: 'zh-CN-Wavenet-D'
    
  # Local TTS settings
  local:
    engine: 'espeak'  # one of: espeak | festival | piper
    voice: 'zh'

二、TTS 应用场景

2.1 语音消息

javascript

/**
 * Speak `text` through the TTS tool, then send the same text as a written caption.
 *
 * @param {string} target - Recipient forwarded to `message`.
 * @param {string} text - Content to speak and to send as the caption.
 * @param {string} [channel='telegram'] - Channel forwarded to `tts`, which
 *   selects the audio format. Defaults to the value that used to be hard-coded.
 * @returns {Promise<void>}
 */
async function sendVoiceMessage(target, text, channel = 'telegram') {
  // Generate the voice clip in the format that matches the destination platform.
  await tts({ text, channel })

  // The audio is attached automatically; the written text doubles as a subtitle.
  await message({
    action: 'send',
    target,
    message: text
  })
}

// Usage: speak a greeting and send it as text to the same recipient
await sendVoiceMessage('老大', '早上好！今天是 2026 年 3 月 20 日，星期五。')

2.2 长篇内容朗读

javascript

/**
 * Read a long article aloud.
 *
 * The file is loaded with `read`, split on blank lines, and spoken in batches
 * of five paragraphs so that no single TTS request becomes too long.
 *
 * @param {string} articlePath - Path handed to `read`.
 * @returns {Promise<void>}
 */
async function readArticle(articlePath) {
  const BATCH_SIZE = 5
  const PAUSE_MS = 2000

  const raw = await read({ path: articlePath })
  const nonEmpty = raw.split('\n\n').filter(block => block.trim())

  let start = 0
  while (start < nonEmpty.length) {
    const end = start + BATCH_SIZE
    const text = nonEmpty.slice(start, end).join('\n\n')

    await tts({ text, channel: 'telegram' })

    // Pause between batches (but not after the last one) to stay under rate limits.
    const moreToCome = end < nonEmpty.length
    if (moreToCome) {
      await sleep(PAUSE_MS)
    }

    start = end
  }
}

// Usage: read a markdown article aloud
await readArticle('/docs/guide/article.md')

2.3 实战案例 1：语音早报

javascript

/**
 * Builds a morning briefing (weather, headlines, schedule) and speaks it
 * section by section through the `tts` tool.
 */
class VoiceBriefing {
  constructor() {
    this.voice = 'alloy'  // default voice; not read by any method of this class
  }

  /**
   * Generate the briefing text, split it into sections and speak each one.
   * @returns {Promise<void>}
   */
  async generateVoiceBriefing() {
    console.log('🎙️ 生成语音早报...')

    // 1. Collect the briefing text
    const briefing = await this.getBriefingContent()

    // 2. Speak it in sections
    const sections = this.splitIntoSections(briefing)

    for (const section of sections) {
      await tts({
        text: section.text,
        channel: 'telegram'
      })

      // Short pause between sections
      await sleep(1000)
    }

    console.log('✅ 语音早报已发送')
  }

  /**
   * Assemble the briefing text from weather, news and schedule.
   * @returns {Promise<string>} Paragraphs separated by blank lines.
   */
  async getBriefingContent() {
    // The three lookups do not depend on each other, so run them concurrently.
    const [weather, news, schedule] = await Promise.all([
      this.getWeather(),
      this.getNews(),
      this.getSchedule()
    ])

    return `早上好！今天是${new Date().toLocaleDateString('zh-CN')}。

天气：${weather}

今日头条：
${news}

今日日程：
${schedule}

祝你有美好的一天！`
  }

  /**
   * Group blank-line separated paragraphs into sections of at most `maxLength`
   * characters (separators included).
   *
   * Empty paragraphs are dropped and no section starts with a separator. A
   * single paragraph longer than `maxLength` is kept whole as its own section
   * rather than being cut mid-sentence.
   *
   * @param {string} content - Text whose paragraphs are separated by '\n\n'.
   * @param {number} [maxLength=500] - Soft upper bound per section.
   * @returns {{text: string}[]} Sections in original order; empty for empty input.
   */
  splitIntoSections(content, maxLength = 500) {
    const sections = []
    let current = ''

    for (const rawParagraph of String(content ?? '').split('\n\n')) {
      const para = rawParagraph.trim()
      if (!para) continue

      const candidate = current ? `${current}\n\n${para}` : para

      if (current && candidate.length > maxLength) {
        // Adding this paragraph would overflow: close the section, start a new one.
        sections.push({ text: current })
        current = para
      } else {
        current = candidate
      }
    }

    if (current) {
      sections.push({ text: current })
    }

    return sections
  }

  /**
   * Fetch a one-line weather summary by shelling out to curl.
   * @returns {Promise<string>} Trimmed stdout of the command.
   */
  async getWeather() {
    const result = await exec({
      command: 'curl -s "wttr.in/驻马店?format=3"'
    })
    return result.stdout.trim()
  }

  /**
   * Fetch up to three headlines via `web_search`.
   * @returns {Promise<string>} Numbered titles, one per line, or a placeholder
   *   when the search yields nothing.
   */
  async getNews() {
    const search = await web_search({
      query: '今日新闻头条',
      count: 3
    })

    // `search?.` keeps a missing search response from crashing the whole briefing.
    return search?.results?.slice(0, 3).map((r, i) =>
      `${i + 1}. ${r.title}`
    ).join('\n') || '暂无新闻'
  }

  /**
   * Placeholder for a calendar lookup; currently always returns a fixed string.
   * @returns {Promise<string>}
   */
  async getSchedule() {
    return '暂无日程安排'
  }
}

// Usage: build and speak the morning briefing
const briefing = new VoiceBriefing()
await briefing.generateVoiceBriefing()

2.4 实战案例 2：故事讲述

javascript

/**
 * Reads a story aloud, distinguishing narration from "speaker：line" dialogue
 * and choosing a voice name per speaker.
 */
class StoryTeller {
  constructor() {
    this.voices = {
      narrator: 'alloy',      // narration
      character1: 'echo',     // character 1
      character2: 'fable',    // character 2
      villain: 'onyx'         // villain
    }

    // Speaker name as written in the story -> key of `this.voices`.
    this.characterVoices = new Map([
      ['小明', 'character1'],
      ['小红', 'character2'],
      ['大魔王', 'villain']
    ])

    // Speaker labels that mark a line as narration rather than dialogue.
    this.narratorLabels = new Set(['旁白'])
  }

  /**
   * Parse the story and speak every segment in order, pausing after each one.
   * @param {string} storyText - Story with one segment per line.
   * @returns {Promise<void>}
   */
  async tellStory(storyText) {
    console.log('📖 开始讲故事...')

    const segments = this.parseStory(storyText)

    for (const segment of segments) {
      // The voice is resolved to illustrate the concept, but it is not passed
      // on: the tts call below only receives text and channel, and the tool
      // may not support voice selection.
      const voice = this.getVoiceForSegment(segment)

      await tts({
        text: segment.text,
        channel: 'telegram'
      })

      // Pause between segments for pacing
      await sleep(segment.pause || 1000)
    }

    console.log('✅ 故事讲完')
  }

  /**
   * Split a story into narration and dialogue segments.
   *
   * A line of the form "speaker：text" (full-width colon) is dialogue unless
   * the speaker is a narrator label, in which case only the text is kept as
   * narration. The speaker pattern is Unicode-aware, so CJK names match —
   * plain `\w` only covers ASCII word characters and never matched them.
   *
   * @param {string} text - Story text, one segment per line.
   * @returns {Array<{type: string, text: string, pause: number, character?: string}>}
   */
  parseStory(text) {
    const speakerLine = /^([\p{L}\p{N}_]+)：(.+)$/u
    const segments = []

    for (const rawLine of String(text ?? '').split('\n')) {
      const line = rawLine.trim()
      if (!line) continue

      const match = line.match(speakerLine)

      if (match && !this.narratorLabels.has(match[1])) {
        segments.push({
          type: 'dialogue',
          character: match[1],
          text: match[2],
          pause: 500
        })
      } else {
        segments.push({
          type: 'narration',
          // Drop the "旁白：" label so it is not read aloud.
          text: match ? match[2] : line,
          pause: 1000
        })
      }
    }

    return segments
  }

  /**
   * Pick the voice name for a segment.
   * @param {{type: string, character?: string}} segment
   * @returns {string} Voice name; unknown speakers fall back to the narrator voice.
   */
  getVoiceForSegment(segment) {
    if (segment.type === 'narration') {
      return this.voices.narrator
    }

    const voiceKey = this.characterVoices.get(segment.character)
    return this.voices[voiceKey] ?? this.voices.narrator
  }
}

// Usage: tell a short story
const storyTeller = new StoryTeller()

// Each line has the form "speaker：text" with a full-width colon.
const story = `
旁白：从前有座山，山里有个庙。
小明：师父，为什么我们每天都要敲木鱼？
旁白：老和尚微微一笑。
老和尚：这是为了修心。
小明：修心是什么？
旁白：老和尚指了指窗外。
老和尚：你看那棵树，它从不说话，却年年发芽。
`

await storyTeller.tellStory(story)

2.5 实战案例 3：电影解说

javascript

/**
 * Generates a six-part narration script for a movie and speaks it chapter by chapter.
 */
class MovieNarrator {
  constructor() {
    this.speed = 1.1  // slightly faster pace; not read by any method of this class
  }

  /**
   * Narrate a movie: intro, each available script section, then an outro.
   *
   * @param {{title: string, synopsis: string, duration: string}} summary
   * @returns {Promise<void>}
   */
  async narrateMovie(summary) {
    console.log('🎬 开始电影解说...')

    // 1. Generate the script
    const script = await this.generateScript(summary)

    // 2. Intro
    const intro = `欢迎来到电影解说。今天我们要讲的是${summary.title}。`

    await tts({ text: intro })
    await sleep(2000)

    // 3. Sections, in narration order
    const sections = [
      { title: '故事背景', text: script.background },
      { title: '主要人物', text: script.characters },
      { title: '剧情发展', text: script.plot },
      { title: '高潮部分', text: script.climax },
      { title: '结局', text: script.ending },
      { title: '观后感', text: script.review }
    ]

    for (const section of sections) {
      // A generated script may omit a section; skip it instead of crashing on `.split`.
      if (typeof section.text !== 'string' || !section.text.trim()) {
        continue
      }

      // Speak the bare title: markdown markers such as "##" are meaningless in audio.
      await tts({ text: section.title })
      await sleep(1000)

      // Section body, one paragraph per request
      for (const para of section.text.split('\n\n')) {
        if (para.trim()) {
          await tts({ text: para })
          await sleep(800)
        }
      }

      // Pause between sections
      await sleep(2000)
    }

    // 4. Outro
    const outro = `以上就是${summary.title}的完整解说。感谢收听，我们下期再见。`
    await tts({ text: outro })

    console.log('✅ 电影解说完成')
  }

  /**
   * Ask a sub-session to write the narration script.
   *
   * The prompt names the exact JSON keys that `narrateMovie` reads, so the
   * generated object can actually be consumed.
   *
   * @param {{title: string, synopsis: string, duration: string}} summary
   * @returns {Promise<object>} Parsed script object.
   * @throws {Error} When the output contains no JSON object.
   * @throws {SyntaxError} When the extracted JSON is malformed.
   */
  async generateScript(summary) {
    const result = await sessions_spawn({
      task: `基于以下电影信息生成解说稿：

      标题：${summary.title}
      简介：${summary.synopsis}
      时长：${summary.duration}

      要求：
      1. 分成背景、人物、剧情、高潮、结局、观后感 6 个部分
      2. 每部分 300-500 字
      3. 语言生动有趣
      4. 适合语音朗读

      返回 JSON 格式，键名依次为：
      background、characters、plot、climax、ending、review（值均为字符串）。
      只返回 JSON，不要包含其他文字。`,
      mode: 'run',
      timeoutSeconds: 180
    })

    return this.parseScript(result.output)
  }

  /**
   * Extract and parse the JSON object embedded in model output.
   * Tolerates surrounding prose or markdown code fences by parsing only the
   * span from the first "{" to the last "}".
   *
   * @param {string} output - Raw text returned by the sub-session.
   * @returns {object}
   */
  parseScript(output) {
    const raw = String(output ?? '')
    const start = raw.indexOf('{')
    const end = raw.lastIndexOf('}')

    if (start === -1 || end <= start) {
      throw new Error('解说稿中未找到 JSON 对象')
    }

    return JSON.parse(raw.slice(start, end + 1))
  }
}

// Usage: narrate a movie from its title, synopsis and running time
const narrator = new MovieNarrator()

await narrator.narrateMovie({
  title: '《肖申克的救赎》',
  synopsis: '银行家安迪被冤枉入狱，在肖申克监狱度过 20 年，最终通过智慧获得自由。',
  duration: '142 分钟'
})

三、高级语音功能

3.1 情感语音

javascript

/**
 * Speaks text with an "emotion", expressed here as an emoji suffix on the text.
 * Per-emotion speed/pitch/volume profiles are stored on the instance.
 */
class EmotionalTTS {
  constructor() {
    const profile = (speed, pitch, volume) => ({ speed, pitch, volume })

    this.emotionProfiles = {
      happy: profile(1.2, 1.1, 1.0),
      sad: profile(0.8, 0.9, 0.8),
      excited: profile(1.4, 1.2, 1.1),
      calm: profile(0.9, 1.0, 0.9),
      serious: profile(1.0, 0.95, 1.0)
    }
  }

  /**
   * Decorate `text` for the given emotion and speak it.
   * @param {string} text
   * @param {string} [emotion='calm']
   * @returns {Promise<void>}
   */
  async speakWithEmotion(text, emotion = 'calm') {
    // The profile is looked up (falling back to calm) but not forwarded:
    // the tts call below only receives text and channel.
    const settings = this.emotionProfiles[emotion] || this.emotionProfiles.calm

    const decorated = this.addEmotionMarkers(text, emotion)
    const request = { text: decorated, channel: 'telegram' }

    await tts(request)
  }

  /**
   * Append the emoji associated with `emotion`, if there is one.
   * @param {string} text
   * @param {string} emotion
   * @returns {string} `text` unchanged for 'serious' and any unlisted emotion.
   */
  addEmotionMarkers(text, emotion) {
    const suffixes = new Map([
      ['happy', ' 😊'],
      ['sad', ' 😢'],
      ['excited', ' 🎉']
    ])

    if (!suffixes.has(emotion)) {
      return text
    }

    return text + suffixes.get(emotion)
  }
}

// Usage: the second argument selects the emotion
const emotionalTTS = new EmotionalTTS()

await emotionalTTS.speakWithEmotion('太棒了！我们成功了！', 'excited')
await emotionalTTS.speakWithEmotion('很遗憾，这次没有成功。', 'sad')
await emotionalTTS.speakWithEmotion('接下来是重要通知。', 'serious')

3.2 多语言支持

javascript

/**
 * Speaks text in one of a fixed set of languages. Each request is prefixed
 * with a "[language name]" tag taken from the language table.
 */
class MultiLanguageTTS {
  constructor() {
    const entry = (voice, lang) => ({ voice, lang })

    this.languageVoices = {
      'zh-CN': entry('alloy', '中文'),
      'en-US': entry('nova', 'English'),
      'ja-JP': entry('echo', '日本語'),
      'ko-KR': entry('fable', '한국어'),
      'es-ES': entry('onyx', 'Español')
    }
  }

  /**
   * Speak `text`, tagged with the display name of `language`.
   * @param {string} text
   * @param {string} language - Key of `languageVoices`, e.g. 'zh-CN'.
   * @returns {Promise<void>}
   * @throws {Error} When `language` is not in the table.
   */
  async speakInLanguage(text, language) {
    const settings = this.languageVoices[language]

    if (settings) {
      const tagged = `[${settings.lang}] ${text}`
      await tts({ text: tagged, channel: 'telegram' })
      return
    }

    throw new Error(`不支持的语言：${language}`)
  }

  /**
   * Speak several items one after another, pausing one second after each.
   * @param {Array<{lang: string, text: string}>} texts
   * @returns {Promise<void>}
   */
  async speakMultilingual(texts) {
    for (const entry of texts) {
      const spokenText = entry.text
      const languageCode = entry.lang

      await this.speakInLanguage(spokenText, languageCode)
      await sleep(1000)
    }
  }
}

// Usage
const multiTTS = new MultiLanguageTTS()

// Greet in several languages, one request per entry
await multiTTS.speakMultilingual([
  { lang: 'zh-CN', text: '你好，欢迎！' },
  { lang: 'en-US', text: 'Hello, welcome!' },
  { lang: 'ja-JP', text: 'こんにちは、ようこそ！' },
  { lang: 'ko-KR', text: '안녕하세요, 환영합니다!' }
])

3.3 语音合成队列

javascript

/**
 * Serialises TTS requests: items are spoken one at a time, high-priority items
 * go ahead of normal ones, and a failed item is retried a limited number of times.
 */
class TTSQueue {
  constructor() {
    this.queue = []
    this.processing = false
  }

  /**
   * Enqueue a text and start the worker if it is idle.
   *
   * High-priority items are placed after any high-priority items already
   * waiting, so they keep arrival order among themselves instead of
   * overtaking each other.
   *
   * @param {string} text
   * @param {string} [priority='normal'] - 'high' jumps ahead of normal items.
   * @returns {Promise<number>} Queue length right after scheduling. This
   *   resolves once the item is queued, not once it has been spoken.
   */
  async add(text, priority = 'normal') {
    const item = {
      text,
      priority,
      addedAt: Date.now(),
      // Must start at 0: comparing an undefined counter with `<` is always
      // false, which silently disabled retries.
      retries: 0
    }

    if (priority === 'high') {
      const firstNormal = this.queue.findIndex(queued => queued.priority !== 'high')
      if (firstNormal === -1) {
        this.queue.push(item)
      } else {
        this.queue.splice(firstNormal, 0, item)
      }
    } else {
      this.queue.push(item)
    }

    if (!this.processing) {
      // Deliberately not awaited so callers are not blocked; any unexpected
      // failure of the worker is logged rather than left as an unhandled rejection.
      this.process().catch(error => console.error('TTS 队列处理异常:', error))
    }

    return this.queue.length
  }

  /**
   * Drain the queue. Does nothing if a drain is already in progress.
   * A failing item is put back at the front and retried up to 3 times, then dropped.
   * @returns {Promise<void>}
   */
  async process() {
    if (this.processing) return

    const maxRetries = 3
    this.processing = true

    try {
      while (this.queue.length > 0) {
        const item = this.queue.shift()

        try {
          await tts({
            text: item.text,
            channel: 'telegram'
          })

          // Space requests out to stay under rate limits
          await sleep(500)

        } catch (error) {
          console.error('TTS 失败:', error)

          const retries = item.retries ?? 0
          if (retries < maxRetries) {
            item.retries = retries + 1
            this.queue.unshift(item)
            await sleep(2000)
          }
        }
      }
    } finally {
      // Always release the flag, otherwise a throw would block every later add().
      this.processing = false
    }
  }

  /** Drop every pending item. An item already being spoken is not interrupted. */
  clear() {
    this.queue = []
  }

  /**
   * @returns {{queueLength: number, processing: boolean}} Snapshot of the queue state.
   */
  getStatus() {
    return {
      queueLength: this.queue.length,
      processing: this.processing
    }
  }
}

// Usage
const ttsQueue = new TTSQueue()

// Queue several speech tasks. add() starts processing without awaiting it,
// so each await here resolves once the item is queued, not once it is spoken.
await ttsQueue.add('第一条消息')
await ttsQueue.add('第二条消息', 'high')  // high priority
await ttsQueue.add('第三条消息')

// Inspect the queue state
console.log(ttsQueue.getStatus())

四、语音与文字混合

4.1 智能选择

javascript

/**
 * Chooses between plain text and voice-plus-summary based on content length.
 */
class SmartMessageSender {
  constructor() {
    this.voiceThreshold = 200  // content longer than this many characters is spoken
  }

  /**
   * Send `content` to `target`, as text when short and as voice when long.
   * @param {string} target
   * @param {string} content
   * @returns {Promise<void>}
   */
  async send(target, content) {
    const exceedsThreshold = content.length > this.voiceThreshold

    if (!exceedsThreshold) {
      // Short content: plain text only
      await message({ action: 'send', target, message: content })
      return
    }

    // Long content: voice plus a written summary
    await this.sendWithVoice(target, content)
  }

  /**
   * Summarise the content, speak the full content, then send the summary as text.
   * @param {string} target
   * @param {string} content
   * @returns {Promise<void>}
   */
  async sendWithVoice(target, content) {
    const digest = await this.generateSummary(content)

    await tts({ text: content, channel: 'telegram' })

    const caption = `📝 文字摘要：\n\n${digest}`
    await message({ action: 'send', target, message: caption })
  }

  /**
   * Ask a sub-session for a summary of at most 100 characters.
   * @param {string} content
   * @returns {Promise<string>} The `output` field of the sub-session result.
   */
  async generateSummary(content) {
    const task = `将以下内容总结为 100 字以内的摘要：\n\n${content}`
    const response = await sessions_spawn({ task, mode: 'run' })
    return response.output
  }
}

// Usage
const sender = new SmartMessageSender()

// Short message: sent as plain text
await sender.send('老大', '好的，马上处理。')

// Long message: voice plus summary.
// `longArticleContent` is not defined in this snippet; supply your own string.
await sender.send('老大', longArticleContent)

4.2 语音字幕

javascript

/**
 * Speak `text`, then send the same text (prefixed with 📝) as a written subtitle.
 *
 * @param {string} target - Recipient forwarded to `message`.
 * @param {string} text - Content to speak and to caption.
 * @returns {Promise<void>}
 */
async function sendVoiceWithSubtitle(target, text) {
  // Voice first
  const voiceRequest = { text, channel: 'telegram' }
  await tts(voiceRequest)

  // Then the written subtitle
  const subtitle = `📝 ${text}`
  await message({ action: 'send', target, message: subtitle })
}

// Usage: one call sends both the voice clip and its subtitle
await sendVoiceWithSubtitle('老大', '这是一条语音消息，文字版本作为字幕方便阅读。')

五、语音内容创作

5.1 播客生成

javascript

/**
 * Generates a host/guest podcast script and speaks it turn by turn, framed by
 * placeholder intro and outro "music" messages.
 */
class PodcastGenerator {
  constructor() {
    // Voice names for the two speakers; not read by any method of this class.
    this.hostVoice = 'alloy'
    this.guestVoice = 'echo'
  }

  /**
   * Produce and speak a podcast episode.
   * @param {string} topic
   * @param {string} [duration='10min'] - Target length, passed into the prompt.
   * @returns {Promise<void>}
   */
  async generatePodcast(topic, duration = '10min') {
    console.log(`🎙️ 生成播客：${topic}`)

    // 1. Generate the script
    const script = await this.generateScript(topic, duration)

    // 2. Intro music (simulated)
    await this.playIntroMusic()

    // 3. Host opening
    await tts({ text: script.intro, channel: 'telegram' })
    await sleep(2000)

    // 4. Main dialogue. A script without a `segments` array is treated as
    //    having no dialogue instead of crashing the loop.
    const segments = Array.isArray(script.segments) ? script.segments : []
    for (const segment of segments) {
      // Host
      await tts({ text: `主持人：${segment.host}`, channel: 'telegram' })
      await sleep(1000)

      // Guest
      await tts({ text: `嘉宾：${segment.guest}`, channel: 'telegram' })
      await sleep(1500)
    }

    // 5. Closing words
    await tts({ text: script.outro, channel: 'telegram' })

    // 6. Outro music
    await this.playOutroMusic()

    console.log('✅ 播客生成完成')
  }

  /**
   * Ask a sub-session to write the podcast script as JSON.
   * @param {string} topic
   * @param {string} duration
   * @returns {Promise<{intro: string, segments: Array<{host: string, guest: string}>, outro: string}>}
   *   Shape requested by the prompt; the model output is not validated beyond JSON parsing.
   * @throws {Error} When the output contains no JSON object.
   * @throws {SyntaxError} When the extracted JSON is malformed.
   */
  async generateScript(topic, duration) {
    const result = await sessions_spawn({
      task: `生成一个关于"${topic}"的播客脚本，时长约${duration}。
      
      要求：
      1. 包含开场白、主体对话（5-8 轮）、结束语
      2. 对话自然有趣
      3. 适合语音朗读
      
      返回 JSON: {
        intro: "...",
        segments: [{ host: "...", guest: "..." }, ...],
        outro: "..."
      }`,
      mode: 'run',
      timeoutSeconds: 180
    })

    return this.parseScript(result.output)
  }

  /**
   * Extract and parse the JSON object embedded in model output.
   * Tolerates surrounding prose or markdown code fences by parsing only the
   * span from the first "{" to the last "}".
   *
   * @param {string} output - Raw text returned by the sub-session.
   * @returns {object}
   */
  parseScript(output) {
    const raw = String(output ?? '')
    const start = raw.indexOf('{')
    const end = raw.lastIndexOf('}')

    if (start === -1 || end <= start) {
      throw new Error('播客脚本中未找到 JSON 对象')
    }

    return JSON.parse(raw.slice(start, end + 1))
  }

  /** Simulated intro music: sends a text placeholder instead of real audio. */
  async playIntroMusic() {
    await message({
      action: 'send',
      target: '老大',
      message: '🎵 [开场音乐]'
    })
  }

  /** Simulated outro music: sends a text placeholder instead of real audio. */
  async playOutroMusic() {
    await message({
      action: 'send',
      target: '老大',
      message: '🎵 [结束音乐]'
    })
  }
}

// Usage: generate an episode on a topic with a target length
const podcast = new PodcastGenerator()
await podcast.generatePodcast('AI 与未来工作', '10min')

5.2 有声书

javascript

/**
 * Reads a book aloud chapter by chapter.
 */
class AudiobookReader {
  constructor() {
    // Reading position; not read or updated by any method of this class.
    this.chapter = 1
    this.position = 0
  }

  /**
   * Load a book, split it into chapters and speak each one.
   * @param {string} bookPath - Path handed to `read`.
   * @returns {Promise<void>}
   */
  async readBook(bookPath) {
    const book = await read({ path: bookPath })

    const chapters = this.parseChapters(book)

    for (const [index, chapter] of chapters.entries()) {
      const number = index + 1
      console.log(`朗读第 ${number} 章`)

      // Chapter heading (trimmed so an untitled chapter has no trailing space)
      await tts({ text: `第${number}章 ${chapter.title}`.trim(), channel: 'telegram' })
      await sleep(2000)

      // Chapter body, one paragraph per request
      for (const para of chapter.content.split('\n\n')) {
        // NOTE(review): paragraphs of 50 characters or fewer are skipped, which
        // also drops short dialogue lines — confirm this is intended.
        if (para.trim().length > 50) {
          await tts({ text: para, channel: 'telegram' })
          await sleep(1000)
        }
      }

      // Pause between chapters
      await sleep(5000)
    }
  }

  /**
   * Split book text into chapters.
   *
   * A heading is a line of the form "第N章 title", where N is written in
   * Chinese numerals or digits. Optional markdown "#" markers and spaces
   * around the number are accepted. Each chapter is returned exactly once,
   * with the text between its heading and the next heading (or the end).
   *
   * Text before the first heading is not part of any chapter. When the book
   * has no heading at all, the whole text becomes a single untitled chapter
   * so that it can still be read.
   *
   * @param {string} content - Full book text.
   * @returns {Array<{title: string, content: string}>} Chapters in order;
   *   empty when `content` is blank.
   */
  parseChapters(content) {
    const text = String(content ?? '')
    const headingRegex =
      /^[ \t]*(?:#+[ \t]*)?第[ \t]*[一二三四五六七八九十百千零〇两\d]+[ \t]*章[ \t]*(.*)$/gm

    const headings = [...text.matchAll(headingRegex)]

    if (headings.length === 0) {
      const body = text.trim()
      return body ? [{ title: '', content: body }] : []
    }

    return headings.map((heading, i) => {
      const bodyStart = heading.index + heading[0].length
      const bodyEnd = i + 1 < headings.length ? headings[i + 1].index : text.length

      return {
        title: heading[1].trim(),
        content: text.slice(bodyStart, bodyEnd).trim()
      }
    })
  }
}

// Usage: read a book stored as a markdown file
const reader = new AudiobookReader()
await reader.readBook('/books/novel.md')

六、最佳实践

6.1 语音优化

javascript

/**
 * Clean up text for speech synthesis and split it into sentences.
 *
 * Steps, in order:
 *  1. Convert ASCII sentence punctuation to full-width. This has to happen
 *     before the character filter, which would otherwise delete `.`, `!` and
 *     `?` and leave nothing to normalise. A dot between two digits ("3.14")
 *     is kept as a decimal point.
 *  2. Remove characters outside the allowed set (markdown markers, emoji, ...).
 *  3. Collapse runs of whitespace into a single space.
 *  4. Split after each sentence terminator (。！？), keeping the terminator.
 *
 * @param {string} text - Raw text; null/undefined is treated as empty.
 * @returns {string[]} Non-empty sentences, each ending in 。, ！ or ？
 *   (a trailing fragment without a terminator gets 。 appended).
 */
function preprocessText(text) {
  const cleaned = String(text ?? '')
    // Normalise punctuation first (see step 1 above)
    .replace(/(?<!\d)\.|\.(?!\d)/g, '。')
    .replace(/!/g, '！')
    .replace(/\?/g, '？')
    // Remove special characters; "." survives only as a decimal point
    .replace(/[^\w\s\u4e00-\u9fa5，。！？、；："'（）《》.]/g, '')
    // Collapse whitespace
    .replace(/\s+/g, ' ')

  return cleaned
    .split(/(?<=[。！？])/)
    .map(sentence => sentence.trim())
    // Drop fragments that consist only of punctuation (e.g. from "...")
    .filter(sentence => sentence.replace(/[。！？]/g, '').trim().length > 0)
    .map(sentence => (/[。！？]$/.test(sentence) ? sentence : `${sentence}。`))
}

// Usage: speak the text one sentence at a time.
// `longText` is not defined in this snippet; supply your own string.
const processed = preprocessText(longText)
for (const sentence of processed) {
  await tts({ text: sentence })
  await sleep(500)
}

6.2 速率限制处理

javascript

// TTS providers typically enforce a request rate limit.
const RATE_LIMIT = {
  requests: 10,      // maximum number of requests allowed within one window
  window: 60000      // window length in milliseconds (60 seconds)
}

/**
 * Wraps the `tts` tool with a sliding-window rate limiter: at most `requests`
 * calls are started within any `window` milliseconds.
 */
class RateLimitedTTS {
  /**
   * @param {object} [options]
   * @param {number} [options.requests=RATE_LIMIT.requests] - Max calls per window (>= 1).
   * @param {number} [options.window=RATE_LIMIT.window] - Window length in ms.
   * @throws {RangeError} When `requests` is not at least 1.
   */
  constructor({ requests = RATE_LIMIT.requests, window: windowMs = RATE_LIMIT.window } = {}) {
    if (!(requests >= 1)) {
      throw new RangeError('requests must be at least 1')
    }

    this.limit = requests
    this.windowMs = windowMs
    this.requests = []  // start timestamps (ms) of calls inside the current window
  }

  /**
   * Call `tts(options)`, first waiting if the rate limit has been reached.
   * @param {object} options - Forwarded unchanged to `tts`.
   * @returns {Promise<*>} Whatever `tts` resolves to.
   */
  async tts(options) {
    // Re-check after every wait: the window must be pruned again once time has
    // passed, and other callers may have taken the freed slot in the meantime.
    for (;;) {
      const now = Date.now()
      this.requests = this.requests.filter(t => now - t < this.windowMs)

      if (this.requests.length < this.limit) break

      // Wait until the oldest recorded call leaves the window.
      const waitTime = this.windowMs - (now - this.requests[0])
      console.log(`速率限制，等待 ${waitTime}ms`)
      await sleep(waitTime)
    }

    // Record this call
    this.requests.push(Date.now())

    // Run TTS
    return await tts(options)
  }
}

// Usage: call the wrapper's tts() instead of the global tts()
const rateLimitedTTS = new RateLimitedTTS()
await rateLimitedTTS.tts({ text: '消息内容' })

6.3 错误恢复

javascript

/**
 * Try to speak `text`, retrying on failure with a linearly growing delay
 * (2s, 4s, ...). If every attempt fails, the text is sent as a written
 * message instead.
 *
 * @param {string} text - Content to speak.
 * @param {number} [maxRetries=3] - Total number of attempts.
 * @returns {Promise<*>} The result of `tts` on success; undefined when the
 *   written fallback was sent.
 */
async function ttsWithRetry(text, maxRetries = 3) {
  let attempt = 0

  while (attempt < maxRetries) {
    attempt += 1

    try {
      const spoken = await tts({ text, channel: 'telegram' })
      return spoken
    } catch (err) {
      console.error(`TTS 失败 (${attempt}/${maxRetries}):`, err.message)

      // Back off before the next attempt; there is no wait after the final one.
      const attemptsRemain = attempt < maxRetries
      if (attemptsRemain) {
        await sleep(2000 * attempt)
      }
    }
  }

  // Every attempt failed: fall back to the written version.
  const fallback = `⚠️ 语音生成失败，文字版本：\n\n${text}`
  await message({ action: 'send', target: '老大', message: fallback })
}

七、总结

核心要点

TTS 让 AI 更人性化
长内容分段处理
语音 + 文字混合最佳
注意速率限制
情感语音增强体验

应用场景

场景	推荐方案
短消息	纯文字
长文章	语音 + 摘要
故事讲述	分段语音
早报晚报	语音播报
电影解说	章节式语音
播客	对话式语音

进阶方向

🎙️ 集成 ElevenLabs 等高级 TTS
🎭 实现角色语音切换
🎵 添加背景音乐
📻 构建完整播客系统

🟢🐉 让你的 AI 开口说话吧！

OpenClaw 语音交互与 TTS 实战 ​

概述 ​

一、TTS 基础 ​

1.1 TTS 工具 ​

1.2 支持的语音格式 ​

1.3 语音配置 ​

二、TTS 应用场景 ​

2.1 语音消息 ​

2.2 长篇内容朗读 ​

2.3 实战案例 1：语音早报 ​

2.4 实战案例 2：故事讲述 ​

2.5 实战案例 3：电影解说 ​

三、高级语音功能 ​

3.1 情感语音 ​

3.2 多语言支持 ​

3.3 语音合成队列 ​

四、语音与文字混合 ​

4.1 智能选择 ​

4.2 语音字幕 ​

五、语音内容创作 ​

5.1 播客生成 ​

5.2 有声书 ​

六、最佳实践 ​

6.1 语音优化 ​

6.2 速率限制处理 ​

6.3 错误恢复 ​

七、总结 ​

核心要点 ​

应用场景 ​

进阶方向 ​

OpenClaw 语音交互与 TTS 实战

概述

一、TTS 基础

1.1 TTS 工具

1.2 支持的语音格式

1.3 语音配置

二、TTS 应用场景

2.1 语音消息

2.2 长篇内容朗读

2.3 实战案例 1：语音早报

2.4 实战案例 2：故事讲述

2.5 实战案例 3：电影解说

三、高级语音功能

3.1 情感语音

3.2 多语言支持

3.3 语音合成队列

四、语音与文字混合

4.1 智能选择

4.2 语音字幕

五、语音内容创作

5.1 播客生成

5.2 有声书

六、最佳实践

6.1 语音优化

6.2 速率限制处理

6.3 错误恢复

七、总结

核心要点

应用场景

进阶方向